/***************************************************************************
 *
 * Copyright (C) 2001 International Business Machines
 * All rights reserved.
 *
 * This file is part of the GPFS mmfslinux kernel module.
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice, 
 *     this list of conditions and the following disclaimer. 
 *  2. Redistributions in binary form must reproduce the above copyright 
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution. 
 *  3. The name of the author may not be used to endorse or promote products 
 *     derived from this software without specific prior written
 *     permission. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *************************************************************************** */
/*
 * Implementation of shared segment for GPFS daemon and GPFS kernel code.
 *
 * Contents:
 *   exp_procfs_version
 *   gpfs_proc_export_init
 *   gpfs_proc_export_term
 *   ss_open
 *   ss_release
 *   ss_fs_read
 *   ss_fs_write
 *   ss_fs_ioctl
 *   ss_init
 *   kxSaveThreadInfo
 *
 *   struct ShMemChunkDesc
 *   unprotectKernelMemory
 *   reprotectKernelMemory
 *   InitSharedKernelMemory
 *   TermSharedKernelMemory
 *   cxiCalcMaxSharedKernelMemory
 *   cxiAllocSharedKernelMemory
 *   cxiMapAllSharedKernelMemory
 *   cxiDeallocAllSharedKernelMemory
 *
 * $Id: ss.c,v 1.73.2.4 2002/05/21 21:44:58 dcraft Exp $
 *
 * $Log: ss.c,v $
 * Revision 1.73.2.4  2002/05/21 21:44:58  dcraft
 * Pull GPFS 1.2.1 up to kernel 2.4.18.
 * mmfsfuncs.Linux must be distributed with /usr/lpp/mmfs/src
 * on developerworks.
 *
 * Revision 1.73.2.3  2002/01/25 22:35:46  mcnabb
 * Use VM_RESERVED bit in the vm_area_struct instead of mlock to pin page
 * pool buffers.
 *
 * Revision 1.73.2.2  2001/12/18 14:33:11  mcnabb
 * kxMapPrivate now works on 2.4.2 and later kernels.
 *
 * Revision 1.73.2.1  2001/12/12 17:00:25  mcnabb
 * Map page pool MAP_PRIVATE and VM_DONTCOPY so child copy on write
 * semantics don't occur and we can get rid of flush_inode_pages().
 *
 * Problems with kxMapPrivate on later kernel levels.  Ifdef it out
 * while I figure out whats wrong.
 *
 * Revision 1.73  2001/10/08 17:22:01  eshel
 * move kxSaveThreadInfo() for ia64 to ss_ia64.c
 *
 * Revision 1.72  2001/09/25 18:21:00  gjertsen
 * Cleanup some IA64 code.
 *
 * Revision 1.71  2001/09/22 20:08:08  dcraft
 * Remove kiobufs from cxiKernelIODescriptor_t.  Use temporary
 * kiobufs for map/unmap.   Remove dead code and dead comments
 * in portability layer and update readmes and license.
 * Fix traceback to appear in mmfs.log file.
 *
 * Revision 1.70  2001/09/21 22:54:32  yuri
 * Remove core dump code
 *
 * Revision 1.69  2001/09/19 23:40:02  eshel
 * Add privilege level support for linux ia64.
 *
 * Revision 1.68  2001/08/27 16:00:21  dcraft
 * Fix a non void return complaint on 2.4.3-12
 *
 * Revision 1.67  2001/08/24 13:10:11  gjertsen
 * Need to flush IA64 PTE entries from TLB when page privilege level is changed.
 *
 * Revision 1.66  2001/08/09 21:11:23  dcraft
 * Modifications to allow running on latest Redhat 7.1 update
 * Kernel version 2.4.3-12.
 * Requires checkout of new site.mcr.proto
 *
 * Revision 1.65  2001/08/06 23:38:24  wyllie
 * Do not hold kernel lock while doing kx... operations
 *
 * Revision 1.64  2001/07/13 19:55:03  wyllie
 * Add sharedMemLimit config parameter to control the size of the GPFS shared
 * segment on Linux.
 *
 * Revision 1.63  2001/07/11 00:35:24  eshel
 * Add a kxNoOp() call to test kx call cost and table of kx call counters.
 *
 * Revision 1.62  2001/05/25 14:48:22  gjertsen
 * Minor fixes to get IA64 code to compile again.
 *
 * Revision 1.61  2001/05/05 01:55:55  tee
 * Map the shared segment if root user reads from /dev/ss0.  Return address of
 * first chunk of memory in user buffer.  This is temporary until a better
 * interface is defined.
 *
 * Revision 1.60  2001/04/25 16:07:30  wyllie
 * Grow UMALLOC pool incrementally on Linux
 *
 * Revision 1.59  2001/04/22 16:36:40  dcraft
 * Reimplement wait queue structure to have a chain of linux
 * wait queue heads and only one waiter per head.  This allows
 * us to control exactly which task will wake up.  Previously
 * the OS was free to select any task on the wait queue head.
 * This gave us incorrect semantics for "wakeup with result"
 * and resulted in crashes stating unexpected EINTR from wait.
 *
 * Revision 1.58  2001/04/21 01:04:25  wyllie
 * Use new infrastructure for allocating and making accessible memory shared
 * between the kernel and the daemon.
 *
 * Revision 1.57  2001/04/19 20:50:20  wyllie
 * Checkpoint partial code to dynamically grow the memory shared between the
 * kernel and the GPFS daemon
 *
 * Revision 1.56  2001/04/10 21:10:47  wyllie
 * Convert cxiIOBuffer.C from C++ to C.
 *
 * Revision 1.55  2001/03/28 05:14:13  schmuck
 * Tweak some trace levels.
 *
 * Revision 1.54  2001/03/24 01:51:09  eshel
 * call elf_tcore_dump() only when regs was not requested
 *
 * Revision 1.53  2001/03/23 00:23:26  eshel
 * minor changes
 *
 * Revision 1.52  2001/03/20 22:58:42  eshel
 * Add option to kxSaveThreadInfo() to get regs.
 *
 * Revision 1.51  2001/03/10 02:24:34  eshel
 * move CHECK_PERM to the right place_
 *
 * Revision 1.50  2001/03/08 17:59:48  eshel
 * add permission checks
 *
 * Revision 1.49  2001/03/07 18:45:34  eshel
 * fix check for access mode
 *
 * Revision 1.48  2001/03/07 18:11:17  eshel
 * Add permission checks. Give full access to super user or users that have write
 * write access to /dev/ss0. Other users with read access are limited to some
 * ioctl operations.
 *
 * Revision 1.47  2001/03/06 17:11:38  wyllie
 * Raise trace level of TRACE_SSEG trace of mapped pages.
 *
 * Revision 1.46  2001/03/01 19:58:31  dixonbp
 * ss_fs_read changes to eliminate extra page table
 *
 * Revision 1.45  2001/02/26 17:33:15  dcraft
 * Pare down unused code pieces.  Run cindent.
 *
 * Revision 1.44  2001/02/19 23:16:14  dcraft
 * split elfdump out so we can control whether we ship it (tcore dump may
 * be available in some distributions).  Must be turned on in site.mcr
 *
 * Revision 1.43  2001/02/12 18:31:45  dixonbp
 * Some of the changes needed to ss_fs_read to handle requests for
 * a segment other than 16M.  Also, check the size against physical
 * limits and downsize it if necessary.  Return the actual size.
 *
 * Revision 1.42  2001/02/07 19:45:24  dixonbp
 * Before ss_get_raddr tries to obtain the address of a page, touch the
 * virtual page first to make sure the page tables have been initialized.
 *
 * Revision 1.41  2001/01/28 20:42:53  dcraft
 * Invoke lcrash for kernel thread tracebacks.  Mod kxDumpTraceBack()
 * to allow a file name for output.  Normal ksymoops is backup if
 * lcrash isn't present.
 *
 * Revision 1.40  2001/01/08 21:22:03  wyllie
 * Use explicit LINUX_KERNEL_VERSION instead of LATEST_LINUX_KERNEL to allow
 * better control over code that depends of the level of the Linux kernel.
 *
 * Revision 1.39  2000/12/20 22:19:28  gjertsen
 * Update copyright info.
 *
 * Revision 1.38  2000/12/19 21:11:58  wyllie
 * Remove assertions and traces about the state of the Linux BKL.  Linux does
 * not keep track of who owns the lock, so these asserts were blowing up on
 * an SMP if the kernel lock happened to be held by the other processor.
 *
 * Revision 1.37  2000/12/15 13:56:49  gjertsen
 * Clean up documentation.
 *
 * Revision 1.36  2000/12/12 16:57:49  eshel
 * Add debugging information.
 *
 * Revision 1.35  2000/12/08 20:37:00  eshel
 * Changes for next release of linux.
 *
 * Revision 1.34  2000/12/05 22:06:53  gjertsen
 * IA64 fixes for latest gnu toolchain w/ glibc2.2 and 2.4.0test10 kernel.
 * Replace vmlist macro calls with full calls to spinlock.
 *
 * Revision 1.33  2000/12/05 19:12:43  eshel
 * Add code for core dump on ia64.
 *
 * Revision 1.32  2000/11/30 22:11:28  eshel
 * Add and move code for ia64.
 *
 * Revision 1.31  2000/11/29 22:13:00  eshel
 * Allow for multiple core dumps.
 *
 * Revision 1.30  2000/11/29 04:38:06  eshel
 * Add code to get thread traceback from a core file.
 *
 * Revision 1.29  2000/11/17 15:21:34  gjertsen
 * Rename or make static several functions.
 * Minor fix for gpl-ksyms.c when MODULES isn't used.
 *
 * Revision 1.28  2000/11/09 01:31:56  eshel
 * Move more code to common routines for ia64 and i386.
 *
 * Revision 1.27  2000/11/08 19:31:18  eshel
 * Add some comments.
 *
 * Revision 1.26  2000/11/08 00:51:13  eshel
 * Fix ia64 compile.
 *
 * Revision 1.25  2000/11/07 23:01:32  eshel
 * Move more common code to ss.c.
 *
 * Revision 1.24  2000/11/07 18:43:34  eshel
 * Start moving common code from ia64/ss.c and i386/ss.c to parent directory.
 *
 * Revision 1.25  2000/11/07 00:16:31  eshel
 * Add code to support remount.
 *
 * Revision 1.24  2000/11/06 19:56:25  gjertsen
 * Linux code cleanup and put in build safeguards.
 *
 * Revision 1.23  2000/11/03 14:25:51  gjertsen
 * Undo last change. Need to update site.mcr file.
 *
 * Revision 1.22  2000/11/02 23:49:17  wyllie
 * Fix include for file in gpl-linux dir
 *
 * Revision 1.21  2000/11/01 00:18:37  eshel
 * Add code to restart mmfsd with out unloading the kernel extension.
 *
 * Revision 1.20  2000/10/28  01:14:28  wyllie
 * Use uppercase hex in TRACEs and printfs consistently
 *
 * Revision 1.19  2000/10/27  22:57:42  eshel
 * Change format of oops output.
 *
 * Revision 1.18  2000/10/26  22:51:37  eshel
 * Don't do kernel trace back for my self or proc init, and make sure that if KDB
 * is used the first line of output starts with "kdba_bt".
 *
 * Revision 1.17  2000/10/26  20:54:55  gjertsen
 * Purge out ugly USE_CWRAPPERS and export module symbols explicitly
 * as the default (in IA64 safe manner). Split out shared seg stuff into
 * another library to avoid defining things twice.
 *
 * Revision 1.16  2000/10/25  20:48:41  dcraft
 * Add cxiSigContext_t
 *
 * Revision 1.15  2000/10/24  14:05:49  gjertsen
 * Clean up linux module specific code so that gpfs can be
 * compiled in the kernel. Rename funcs pulled from kernel.
 *
 * Revision 1.14  2000/10/23  23:21:37  eshel
 * Add REUSE_SHARED_SEGMENT flag if the shared segment should be reused between
 * deamon restarts.
 *
 * Revision 1.13  2000/10/20  20:02:36  eshel
 * Add code for kernel threads traceback.
 *
 * Revision 1.12  2000/10/14  00:20:48  wyllie
 * Document (with DBGASSERTs) whether or not the kernel lock is held at entry
 * to GPFS file, inode, and superblock operations.  Dump whether or not the
 * kernel lock is held on entry to routines whose behavior does not agree
 * with linux/Documentation/filesystems/Locking.
 *
 * Revision 1.11  2000/10/12  20:37:06  eshel
 * Dump core to the same directory as dataStructureDump to a file name that
 * includes the pid and node id.
 *
 * Revision 1.10  2000/10/09  23:50:29  eshel
 * Don't dump zero pages; put GPFS core dump in /tmp/GPFS_core.
 *
 * Revision 1.9  2000/09/29  21:22:44  eshel
 * add Copyright and credits
 *
 * Revision 1.8  2000/09/28  19:41:43  eshel
 * Change core dump to include the shared segment.
 *
 * Revision 1.7  2000/09/16  21:43:12  eshel
 * Remove LATEST_LINUX_KERNEL ifdef code.
 *
 * Revision 1.6  2000/08/29  18:32:18  dcraft
 * Header include cleanup.
 *
 * Revision 1.5  2000/08/28  21:41:55  gjertsen
 * Allow kernel symbols to be explicilty exported.
 *
 * Revision 1.4  2000/08/08  16:39:27  eshel
 * Change SS_PAGE to PAGE_SISE.
 *
 * Revision 1.3  2000/08/01  21:27:38  wyllie
 * Increased a trace level to usually get rid of a common trace.
 *
 * Revision 1.2  2000/07/31  21:59:32  eshel
 * don't use linux/compile.h, might not be there if kernel was not compiled.
 *
 * Revision 1.1  2000/07/28  23:30:10  eshel
 * move ss.c to arch dependent directories
 *
 * Revision 1.22  2000/07/24  18:09:35  eshel
 * Get size of shared segment before remapping it.
 *
 * Revision 1.21  2000/07/24  14:13:14  gjertsen
 * Introduce cxiCopyin, cxiCopyout, and cxiCopyInstr as general kernel ops.
 * Also some minor IA64 build fixes.
 *
 * Revision 1.20  2000/07/21  17:43:52  eshel
 * Add calles to kxDeclarePages an kxUndeclarePages.
 *
 * Revision 1.19  2000/07/11  23:37:03  eshel
 * Change LINUX_2_3_99_PRE6 to LATEST_LINUX_KERNEL for the latest supported kernel
 *
 * Revision 1.18  2000/06/30  23:43:47  schmuck
 * Default build environment for Linux is 2.3.99-pre3.
 * Require -DLINUX_2_3_99_PRE6 when building on 2.3.99-pre6.
 *
 * Revision 1.17  2000/06/12  22:11:23  eshel
 * add a comment
 *
 * Revision 1.16  2000/06/07  23:52:21  eshel
 * fix do_coredump() for LINUX_2_3_99_PRE3
 *
 * Revision 1.15  2000/06/07  20:25:16  dixonbp
 * Remove call to filp_open under LINUX_2_3_99_PRE3 and fix some traces.
 *
 * Revision 1.14  2000/06/07  16:54:11  eshel
 * Change PDEBUG to TRACE
 *
 */

#include <Shark-gpl.h>

#include <linux/types.h>
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/proc_fs.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/signal.h>
#include <linux/locks.h>
#include <linux/vmalloc.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/user.h>
#include <asm/mman.h>
#include <asm/atomic.h>
#include <asm/ptrace.h>
#include <asm/ucontext.h>
#include <asm/elf.h>

#include <linux2gpfs.h>
#include <arch-gpl.h>

#include <cxiSystem.h>
#include <cxiSharedSeg.h>
#include <cxiIOBuffer.h>
#include <Trace.h>

#ifdef GPFS_ARCH_POWER     // ioremap not exported by kernel ???
/* POWER-only definitions: VMALLOC_END is defined in terms of a hard-coded
   PKMAP_BASE, and a local set_pte() is supplied for use by this file. */
#define PKMAP_BASE (0xfe000000UL)
#define VMALLOC_END PKMAP_BASE

/* Store the page table entry value pte into the slot *ptep.
   When _PAGE_HASHPTE is a nonzero bit, the store goes through pte_update()
   with _PAGE_HASHPTE masked out of both arguments -- presumably so that the
   slot's current _PAGE_HASHPTE state is left untouched (TODO confirm against
   the kernel's pte_update).  Otherwise it is a plain assignment. */
void set_pte(pte_t *ptep, pte_t pte)
{
#if _PAGE_HASHPTE != 0
  pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
#else
  *ptep = pte;
#endif
}
#endif

/* Banner text reported through /proc by exp_procfs_version */
const char *gpfs_banner = "GPFS Linux version " UTS_RELEASE "\n";

/* Values that can be supplied at module load time (see the MODULE_PARM
   declarations below): two strings and one integer. */
char *prog_path = NULL;
char *mmfsd_path = NULL;
int  trace_level;

#ifdef PERF_STATS
/* Per-operation call counters: incremented in ss_fs_ioctl, read back or
   reset through kxNoOp. */
int ioctl_count[MAX_SS_IOCTL_OPS];
#endif

#ifdef MODULE
MODULE_PARM(prog_path, "s");
MODULE_PARM(mmfsd_path, "s");
MODULE_PARM(trace_level, "i");
#endif /* MODULE */

/* Dynamically assigned major device number for the ioctl interfaces to the
   GPFS kernel modules.  This is the /dev/ss0 device.  Set by ss_init and
   reset to 0 by TermSharedKernelMemory. */
int GPFSIoctlMajorNumber;

  /* Only allow the users with write access or root users.
     Makes the enclosing function return -EPERM unless the device was opened
     for writing or the caller is the super user.  Expects a variable named
     `file' (struct file *) to be in scope at the point of use.  The body is
     wrapped in do { } while (0) so that `CHECK_PERM;' is exactly one
     statement and cannot break or capture a following `else'. */
#define CHECK_PERM do                                                        \
                   {                                                         \
                     if (!(file->f_mode & FMODE_WRITE) && !cxiIsSuperUser()) \
                       return -EPERM;                                        \
                   } while (0)

/* Vector table for all routines that can be called with the ss_fs_ioctl.
   Within ss_fs_ioctl, slot 0 is the entry that gets invoked (with the op
   code and the raw argument word) for every op not handled inline; the slot
   indexed by the op code is only referenced for tracing there. */
int (*ss_ioctl_op[MAX_SS_IOCTL_OPS+1])();

/* Add GPFS information to the /proc file system.
 *
 * read_proc style callback: formats a fixed version line followed by
 * gpfs_banner into buffer and hands back the part starting at offset.
 *
 *   buffer - scratch buffer the text is formatted into
 *   start  - out: set to the first byte to return to the reader
 *   offset - position within the text the reader is asking for
 *   length - maximum number of bytes the reader wants
 *   eof    - out: always set to 1, the whole text is produced in one call
 *   data   - unused
 *
 * Returns the number of bytes available at *start (at most length), or 0
 * when offset is at or beyond the end of the text.
 */
int
exp_procfs_version(char *buffer, char **start, off_t offset,
                   int length, int *eof, void *data)
{
  int len;

  len = sprintf(buffer, "# GPFS version 1.0\n");
  /* The banner is data, not a format: never let it be parsed for
     conversion specifiers. */
  len += sprintf(buffer + len, "%s", gpfs_banner);
  *eof = 1;

  /* A read positioned past the text gets end-of-file rather than a
     negative count and a pointer outside the formatted data. */
  if (offset < 0 || offset >= len)
  {
    *start = buffer;
    return 0;
  }

  *start = buffer + offset;
  len -= offset;
  if (len > length)
    len = length;

  return len;
}

/* Publish the GPFS entries under /proc: create the fs/gpfs directory and,
   only if that worked, the fs/gpfs/version read entry served by
   exp_procfs_version. */
void
gpfs_proc_export_init(void)
{
  if (proc_mkdir("fs/gpfs", NULL))
    create_proc_read_entry("fs/gpfs/version", 0, NULL,
                           exp_procfs_version, NULL);
}

/* Remove the /proc entries created by gpfs_proc_export_init.  The version
   entry is removed first, then the directory that contained it. */
void
gpfs_proc_export_term(void)
{
  remove_proc_entry("fs/gpfs/version", NULL);
  remove_proc_entry("fs/gpfs", NULL);
}


/* Open entry point of the shared segment character device.  Bumps the
   module use count for the duration of the open and always reports
   success. */
int ss_open(struct inode *inode, struct file *filp)
{
  /* BKL is held here, but this interface is in transition, according to
     Linux doc */
  TRACE2(TRACE_SSEG, 2, TRCID_SS_019, "ss_open: file 0x%lX inode 0x%lX\n",
         filp, inode);

  MOD_INC_USE_COUNT;
  return 0;
}


/* Release/close entry point of the shared segment character device.  Drops
   the module use count taken by ss_open, provided the module is still
   marked in use; always reports success. */
int ss_release(struct inode *inode, struct file *filp)
{
  /* BKL is held if opened in R/W mode */
  TRACE1(TRACE_SSEG, 2, TRCID_SS_023, "ss_release: file 0x%lX\n", filp);

  if (MOD_IN_USE)
  {
    MOD_DEC_USE_COUNT;
  }
  return 0;
}

/* read() entry point of /dev/ss0.  Does not transfer file data: it maps
   the shared segment through cxiMapAllSharedKernelMemory and, when the
   caller's buffer can hold a pointer, stores the segment base address
   there.

   Returns 0 on success (whether or not the address was stored), -ENODEV
   for a minor number other than 0, -EPERM for callers that neither opened
   the device for writing nor are the super user, the negated mapping
   return code if mapping fails, and -EFAULT if the address cannot be
   copied out. */
ssize_t
ss_fs_read(struct file *file, char *buf, size_t nbytes, loff_t *ppos)
{
  char *segBaseP;
  int err;

  TRACE1(TRACE_SSEG, 2, TRCID_SS_059, "ss_fs_read: called 0x%lX\n", nbytes);
  /* BKL is not held at entry */

  /* Only minor 0 is served */
  if (MINOR(file->f_dentry->d_inode->i_rdev) != 0)
    return -ENODEV;

  /* Only allow the users with write access or root users */
  if (!((file->f_mode & FMODE_WRITE) || cxiIsSuperUser()))
    return -EPERM;

  /* Map the shared memory */
  err = cxiMapAllSharedKernelMemory(&segBaseP);
  if (err != 0)
    return -err;

  /* A buffer too small for the base address is not an error; the address
     is simply not reported */
  if (nbytes < sizeof(segBaseP))
    return 0;

  if (cxiCopyOut((char *)&segBaseP, buf, sizeof(segBaseP)) != 0)
    return -EFAULT;

  return 0;
}

/* write() entry point of /dev/ss0.  Was used for debugging; nothing is
   written any more.  Callers that neither opened the device for writing
   nor are the super user get -EPERM, everybody else gets -EINVAL after the
   call has been traced. */
ssize_t
ss_fs_write(struct file *file, const char *buf, size_t nbytes, loff_t *ppos)
{
  int permitted = (file->f_mode & FMODE_WRITE) || cxiIsSuperUser();

  /* Only allow the users with write access or root users */
  if (!permitted)
    return -EPERM;

  /* BKL is not held at entry */
  TRACE1(TRACE_SSEG, 0, TRCID_SS_065, "ss_fs_write: called 0x%lX\n", nbytes);

  return -EINVAL;
}

#ifdef PERF_STATS
/* Access to the per-op ioctl counters.
   op1 == 1 clears every counter.  op2, when it lies strictly between 0 and
   MAX_SS_IOCTL_OPS, selects the counter whose value is returned (read after
   any reset has been applied); otherwise 0 is returned. */
int kxNoOp(int op1, int op2)
{
  if (op1 == 1)
  {
    /* reset all counters */
    int slot = MAX_SS_IOCTL_OPS;

    while (slot-- > 0)
      ioctl_count[slot] = 0;
  }

  /* return the requested counter, if one was requested */
  return (op2 > 0 && op2 < MAX_SS_IOCTL_OPS) ? ioctl_count[op2] : 0;
}
#endif

/* Shared segment and other ioctl calls to the kernel code. */
int 
ss_fs_ioctl(struct inode *inode, struct file *file,
            unsigned int op, unsigned long kx_args)
{
  int len, rc;
  char buf[512];
  struct kxArgs args_cp;
  struct kxArgs *args = (struct kxArgs *)kx_args;

  if (op == kxtraceit)
  {
    CHECK_PERM;

    rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
    if (rc != 0)
      return -1;

    len = 3;
    strncpy(buf, KERN_NOTICE, len);            // KERN_NOTICE = "<5>"
    len += sprintf(buf+len, "dp %X:%d:", cxiGetThreadId(), args_cp.arg3);

    rc = cxiCopyIn((char*)args_cp.arg2, buf+len, args_cp.arg1+1);
    if (rc != 0)
      return -1;

    printk(buf);
    return 0;
  }

  TRACE5(TRACE_KSVFS, 15, TRCID_SS_075,
         "ss_fs_ioctl: op %d opAddr 0x%lX args 0x%lX inode 0x%lX file 0x%lX\n",
         op, ss_ioctl_op[op], kx_args, inode, file);
  /* BKL is held at entry */

#ifdef PERF_STATS
  if (op > 0 && op < MAX_SS_IOCTL_OPS)
    ioctl_count[op]++;
#endif

  switch (op)
  {
    case saveThreadInfo:
      CHECK_PERM;
      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
      if (rc != 0)
        return -1;
      rc = kxSaveThreadInfo(args_cp.arg1, (void *)args_cp.arg2);
      break;

    case GetPrivLevel:
      CHECK_PERM;
      rc = get_privilege_level();
      break;

    case SetPrivLevel:
      CHECK_PERM;
      rc = set_privilege_level(kx_args);
      break;

    case MapPrivate:
      { 
        char *outAddr;

        CHECK_PERM;
        rc = cxiCopyIn((char*)args, (char *)&args_cp, sizeof(args_cp));
        if (rc != 0)
          return -1;

        rc = kxMapPrivate((char *)args_cp.arg1, (unsigned long)args_cp.arg2,
                          (unsigned long)args_cp.arg3, &outAddr);
        if (rc == 0)
          rc = cxiCopyOut((char*)&outAddr, (char*)args_cp.arg4, sizeof(char*));
  
        if (rc != 0)
          rc = -EFAULT;
        break;
      }

#ifdef PERF_STATS
    case noOp:
      rc = cxiCopyIn((char*)args, (char*)&args_cp, sizeof(args_cp));
      if (rc != 0)
        break;
      if (args_cp.arg1 == 0 && args_cp.arg2 == 0)
      { /* continue to the real noop kxNoOp in ssioctl.C */ }
      else
      {
        rc = kxNoOp((int)args_cp.arg1, (int)args_cp.arg2);
        break;
      }
#endif

    default:
      TRACE1(TRACE_KSVFS, 9, TRCID_SS_077,
             "ss_fs_ioctl: invoking ss_ioctl_op %d\n", op);
      if (ss_ioctl_op[0] != 0)
      {
        unlock_kernel();
        rc = ss_ioctl_op[0](op, kx_args);
        lock_kernel();
      }
      else
        return -1;
      break;
  }
  return rc;
}

/* File operations of the shared segment character device.  Only the entry
   points listed here are provided by this file; the other operations, not
   in the following list, come from the bare device. */
struct file_operations ss_fops =
{
  .open    = ss_open,
  .release = ss_release,
  .read    = ss_fs_read,
  .write   = ss_fs_write,
  .ioctl   = ss_fs_ioctl,
};

/* Register the character device ("ss") used for the shared segment
   interfaces and other ioctl calls to the kernel code.  The major number
   is assigned dynamically and remembered in GPFSIoctlMajorNumber.
   Returns 0 on success, -1 if the device could not be registered (in
   which case GPFSIoctlMajorNumber stays 0). */
int
ss_init()
{
  int assigned;

  GPFSIoctlMajorNumber = 0;

  /* Major 0 asks for a dynamically allocated number */
  assigned = register_chrdev(0, "ss", &ss_fops);
  if (assigned >= 0)
  {
    GPFSIoctlMajorNumber = assigned;
    TRACE1(TRACE_SSEG, 2, TRCID_SS_083,
           "ss_init: module loaded ss0 major %d\n", GPFSIoctlMajorNumber);
    return 0;
  }

  TRACE1(TRACE_SSEG, 2, TRCID_SS_081,
         "ss_init: unable to get ss0 major rc %d\n", assigned);
  return -1;
}

/* Management of storage shared between the GPFS daemon and the mmfslinux
   kernel module.  Chunks of memory are allocated on demand by the
   kxAllocSharedKernelMemory call, and are then suballocated by GPFS.  To
   allow free use of pointers, all of this memory is addressed using the
   same virtual addresses whether it is being accessed from the daemon
   process or from a process in kernel mode.  Setting up this addressability
   requires modifying the protection bits in the Linux page table.  For
   historical reasons dating to the implementation of GPFS on AIX, the
   storage shared between the GPFS daemon process and the kernel is
   frequently referred to collectively as "the shared segment". */

/* Description of each allocated chunk.  Allocated chunks are linked
   together from ChunkListHead.  Descriptors are taken from the slab cache
   ChunkCacheP. */
struct ShMemChunkDesc
{
  struct list_head chunkList;  /* list linkage */
  char* vaddrP;                /* virtual address of beginning of chunk */
  int len;                     /* length of chunk */
};

/* Head of the list of chunk descriptors; emptied by
   InitSharedKernelMemory */
struct list_head ChunkListHead;

/* Number of chunks and total size of all chunks */
int NVMallocChunks;
int TotalVMallocBytes;

/* Address of the first chunk allocated.  This value gets returned by
   cxiMapAllSharedKernelMemory as the base of the GPFS shared segment. */
char* FirstVMallocChunkP;

/* Maximum total bytes to allocate, as computed by
   cxiCalcMaxSharedKernelMemory.  cxiAllocSharedKernelMemory refuses a
   request that would take TotalVMallocBytes above this value. */
int MaxTotalVMallocBytes;

/* Beginning and end of the area of kernel virtual memory used by
   vmalloc/vfree.  Captured from VMALLOC_START/VMALLOC_END by
   InitSharedKernelMemory. */
UIntPtr VMallocStart;
UIntPtr VMallocEnd;

/* Minimum size of an allocated chunk; smaller requests are raised to this
   size by cxiAllocSharedKernelMemory */
#define MIN_VMALLOC_CHUNK PAGE_SIZE

/* Lock guarding the chunk list */
spinlock_t ChunkListLock;

/* Pointer to slab allocator for ShMemChunkDesc's; created by
   InitSharedKernelMemory, destroyed by TermSharedKernelMemory */
kmem_cache_t* ChunkCacheP = NULL;


/* Make a range of kernel memory addressable by the current process while
   in user mode.

     vaddrP     - kernel virtual address of the start of the range
     len        - length of the range in bytes; the range is processed one
                  page at a time until len has been used up
     allocating - when true the page's own pte is modified as well (I386 and
                  IA64); when false only the pmd entry is touched on I386
                  and nothing is changed on IA64

   The page tables walked are those of current->mm.  The POWER variant is
   not implemented and only prints the page table pointers. */
static void unprotectKernelMemory(char* vaddrP, int len, Boolean allocating)
{
  struct mm_struct *mm = current->mm;
  unsigned long vaddr = (unsigned long) vaddrP;
  unsigned long vaddr_start = vaddr;   /* remembered for the final TLB flush */
  pgd_t *pgdP;
  pmd_t *pmdP;
  pte_t *pteP;

  /* Change protection for each page in the range */
  TRACE3(TRACE_SSEG, 9, TRCID_UNPROT_ENTER,
         "unprotectKernelMemory: vaddr 0x%lX len %d allocating %d\n",
         vaddr, len, allocating);
  while (len > 0)
  {
    /* Access the page to make sure all levels of the page table have been
       created.  This is a kernel address, so page table entries will
       persist once they have been created, since the Linux kernel is not
       pageable.  The value read is discarded. */
    atomic_read((atomic_t*) vaddrP);

    /* Find page table entries for this page */
    pgdP = PGD_OFFSET(mm, vaddr);
    pmdP = pmd_offset(pgdP, vaddr);
    pteP = pte_offset(pmdP, vaddr);

#ifdef GPFS_ARCH_I386
    /* On IA32, set both the pte, and pmd/pgd to allow mmfsd process-level
     * access to the area.  Since each process has its own page directory
     * (pgd), an attempt to access one of these unprotected pages will be
     * blocked by the protection bit in that process' pgd.  If another process
     * requires access to shared kernel pages, only its pgd need be updated.
     * pmd_t and pte_t are same size and definition, so the pmd entry is
     * edited through the pte macros: pte_mkread() here, pte_rdprotect() in
     * reprotectKernelMemory (the only available macros that hide
     * differences between Suse/Redhat).
     */
    DBGASSERT(sizeof(pte_t) == sizeof(pmd_t));
    set_pte((pte_t *)pmdP, pte_mkread((*(pte_t *)pmdP)));
    if (allocating)
      set_pte(pteP, pte_mkread(*pteP));

#elif defined(GPFS_ARCH_POWER)
    // XXX Not implemented
    //      pmd_val(*pmdP) = pmd_val(*pmdP) | _PAGE_USER;
    //      if (allocating)
    //        set_pte(pteP, pte_mkread(*pteP));
    printk("power xxx pgdP %x pmdP %x pteP %x\n",
            (int)pgdP, (int)pmdP, (int)pteP);
#elif defined(GPFS_ARCH_IA64)
    /* On IA64, set the protection level of the page when it is created.
     * Nothing to do when allowing access from another process except to
     * set the privilege level of the process.
     */
    if (allocating)
      pte_val(*pteP) = pte_val(*pteP) | PRIVILEGE_FLAGS;
#endif

    /* Advance to the next page */
    vaddr += PAGE_SIZE;
    vaddrP += PAGE_SIZE;
    len -= PAGE_SIZE;
  }

  /* It is necessary to flush the TLB entries for IA64 to propagate the
   * pte privilege level change.  The flush covers everything from the
   * start of the range up to the address reached by the loop.
   */
  FLUSH_TLB_RANGE(mm, vaddr_start, vaddr);
}


/* Make a range of kernel memory no longer addressable by user processes
   while in user mode.  Called just before freeing the memory.

     vaddrP - kernel virtual address of the start of the range
     len    - length of the range in bytes; the range is processed one page
              at a time until len has been used up

   Counterpart of unprotectKernelMemory.  The page tables walked are those
   of current->mm.  Nothing is changed on POWER (not implemented). */
static void reprotectKernelMemory(char* vaddrP, int len)
{
  struct mm_struct *mm = current->mm;
  unsigned long vaddr = (unsigned long) vaddrP;
  unsigned long vaddr_start = vaddr;   /* remembered for the final TLB flush */
  pgd_t *pgdP;
  pmd_t *pmdP;
  pte_t *pteP;

  /* Change protection for each page in the range */
  TRACE2(TRACE_SSEG, 4, TRCID_REPROT_ENTER,
         "reprotectKernelMemory: vaddr 0x%lX len %d\n",
         vaddr, len);
  while (len > 0)
  {
    /* Access the page to make sure all levels of the page table have been
       created.  This is a kernel address, so page table entries will
       persist once they have been created, since the Linux kernel is not
       pageable.  The value read is discarded. */
    atomic_read((atomic_t*) vaddrP);

    /* Find page table entries for this page */
    pgdP = PGD_OFFSET(mm, vaddr);
    pmdP = pmd_offset(pgdP, vaddr);
    pteP = pte_offset(pmdP, vaddr);

#ifdef GPFS_ARCH_I386
    /* On IA32, reset the pte and pmd to disallow process-level access.
       The pmd entry is edited through the pte macros, as in
       unprotectKernelMemory. */
    set_pte((pte_t *)pmdP, pte_rdprotect((*(pte_t *)pmdP))); // see unprotect
    set_pte(pteP, pte_rdprotect(*pteP));

#elif defined(GPFS_ARCH_POWER)
    // XXX??? not implemented

#elif defined(GPFS_ARCH_IA64)
    /* On IA64, reset the protection level of the page: the privilege level
       field is cleared and replaced with _PAGE_PL_0. */
    pte_val(*pteP) = (pte_val(*pteP) & ~_PAGE_PL_MASK) | _PAGE_PL_0;
#endif

    /* Advance to the next page */
    vaddr += PAGE_SIZE;
    vaddrP += PAGE_SIZE;
    len -= PAGE_SIZE;
  }

  /* It is necessary to flush the TLB entries for IA64 to propagate the
   * pte privilege level change.  The flush covers everything from the
   * start of the range up to the address reached by the loop.
   */
  FLUSH_TLB_RANGE(mm, vaddr_start, vaddr);
}


/* Set up the bookkeeping for shared kernel memory: record the bounds of
   the vmalloc region, initialize the chunk list and its lock, and create
   the slab cache that chunk descriptors are allocated from.  Failure to
   create the cache is reported through cxiPanic. */
void InitSharedKernelMemory()
{
  TRACE2(TRACE_SSEG, 1, TRCID_SHKERN_INIT,
         "InitSharedKernelMemory called.  VMALLOC_START 0x%lX "
         "VMALLOC_END 0x%lX\n", VMALLOC_START, VMALLOC_END);

  /* Bounds of the kernel's vmalloc area */
  VMallocEnd = (UIntPtr)VMALLOC_END;
  VMallocStart = (UIntPtr)VMALLOC_START;

  /* Start with an empty chunk list and an unlocked list lock */
  INIT_LIST_HEAD(&ChunkListHead);
  spin_lock_init(&ChunkListLock);

  /* Slab cache for ShMemChunkDesc objects: no offset, no flags, no
     constructor, no destructor */
  ChunkCacheP = kmem_cache_create("ShMemChunkDesc",
                                  sizeof(struct ShMemChunkDesc),
                                  0, 0, NULL, NULL);
  if (!ChunkCacheP)
    cxiPanic("Cannot create ShMemChunkDesc cache\n");
}


/* Clean up the code that manages shared kernel memory, including freeing
   all allocated chunks.  Also unregisters the "ss" character device that
   ss_init registered and resets GPFSIoctlMajorNumber to 0.  The steps run
   in a fixed order: chunks are released before the descriptor cache is
   destroyed. */
void TermSharedKernelMemory()
{
  TRACE0(TRACE_SSEG, 1, TRCID_SHKERN_TERM,
         "TermSharedKernelMemory called\n");
  cxiDeallocAllSharedKernelMemory();

  /* Destroy slab allocator for ShMemChunkDesc objects; the result of the
     destroy call is deliberately ignored */
  (void)kmem_cache_destroy(ChunkCacheP);

  /* Unregister the shared segment device driver */
  unregister_chrdev(GPFSIoctlMajorNumber, "ss");
  TRACE1(TRACE_SSEG, 2, TRCID_SSINIT_003,
         "module unloaded major %d\n", GPFSIoctlMajorNumber);
  GPFSIoctlMajorNumber = 0;
}


/* Compute how large the total size of all shared kernel memory regions
   is allowed to grow, based on a desired size.  A value of 0 for
   desiredBytes means to compute the default maximum size.

     desiredBytes - requested limit in bytes, or a value <= 0 to derive the
                    limit from the amount of physical memory
     actualBytesP - out: the limit actually chosen, 0 if not even the
                    minimum size (16M) can be accommodated

   The chosen limit is also stored in MaxTotalVMallocBytes.  Always
   returns 0. */
int cxiCalcMaxSharedKernelMemory(int desiredBytes, int* actualBytesP)
{
  Int64 physMemSize;
  Int64 effPhysMemSize;
  UIntPtr minAllowedSize = 16*1024*1024;
#if defined(GPFS_ARCH_I386)
  UIntPtr maxAllowedSize = 384*1024*1024;
#elif defined(GPFS_ARCH_POWER)
  UIntPtr maxAllowedSize = 384*1024*1024;
#elif defined(GPFS_ARCH_IA64)
  UIntPtr maxAllowedSize = 2047*1024*1024;
#endif
  UIntPtr actualBytes;
  char* p;
  UIntPtr vmRegionSize;
  UIntPtr vmUsed;
  UIntPtr vmRegionReserved;
  UIntPtr maxBytes;

  /* If an explicit number of desired bytes was given, use that value.
     Otherwise compute the size as a fixed fraction (1/16) of real memory
     (only the first 2G on i386).  Either way, a value smaller than the
     minimum allowed is raised to the minimum. */
  physMemSize = (Int64)num_physpages * PAGE_SIZE;
# ifdef GPFS_ARCH_I386
    effPhysMemSize = MIN(physMemSize, (Int64)0x80000000);
# else
    effPhysMemSize = physMemSize;
# endif
  if (desiredBytes > 0)
    actualBytes = desiredBytes;
  else
    actualBytes = effPhysMemSize/16;
  actualBytes = MAX(actualBytes, minAllowedSize);

  /* Compute an approximation of how many bytes are already used in the
     vmalloc region.  The variables needed to compute this exactly are not
     exported from the kernel.  If we vmalloc a single page area and see how
     far the allocated area is from the beginning of the vmalloc region, we
     have at least a lower bound on the amount of vmalloc storage already
     used.  If there have been no vfrees, this will yield an accurate
     answer.  If even a single page cannot be allocated, the whole region
     is treated as used. */
  vmRegionSize = VMallocEnd - VMallocStart;
  p = vmalloc(PAGE_SIZE);
  if (p == NULL)
    vmUsed = vmRegionSize;
  else
  {
    vmUsed = (UIntPtr)p - VMallocStart;
    vfree(p);
  }

  /* Make sure the actual maximum fits within the vmalloc region, taking
     into account memory already used and leaving a reserved area for other
     vmallocs.  These are unsigned quantities: when the used plus reserved
     space already covers the region, the difference must be pinned to 0
     instead of being allowed to wrap around to a huge value that would
     defeat the clamp below. */
  vmRegionReserved = 16*1024*1024;
  if (vmUsed >= vmRegionSize || vmRegionSize - vmUsed <= vmRegionReserved)
    maxBytes = 0;
  else
    maxBytes = vmRegionSize - vmUsed - vmRegionReserved;
  actualBytes = MIN(actualBytes, maxBytes);

  /* Make sure the actual maximum does not exceed the maximum possible */
  actualBytes = MIN(actualBytes, maxAllowedSize);

  /* Make sure the actual maximum is less than half of real memory */
  actualBytes = MIN(actualBytes, effPhysMemSize/2);

  /* Round actual maximum down to a multiple of the page size */
  actualBytes = (actualBytes/PAGE_SIZE) * PAGE_SIZE;

  /* If actual maximum is less than the minimum allowed, return 0 */
  if (actualBytes < minAllowedSize)
    actualBytes = 0;

  /* Return result */
  TRACE5(TRACE_SSEG, 1, TRCID_CALC_MAX_SHARED,
         "cxiCalcMaxSharedKernelMemory: actualBytes 0x%lX desiredBytes %d "
         "physMemSize 0x%lX vmUsed 0x%lX maxBytes 0x%lX\n",
         actualBytes, desiredBytes, physMemSize, vmUsed, maxBytes);
  *actualBytesP = (int)actualBytes;
  MaxTotalVMallocBytes = (int)actualBytes;

  return 0;
}


/* Acquire additional memory that is accessible using the same address from
   both kernel code and the GPFS daemon.  Will get at least minBytes.
   Returns the starting virtual address of the area and its actual length.

   Parameters:
     minBytes      minimum number of bytes wanted.  Requests of at most
                   MIN_VMALLOC_CHUNK bytes are satisfied with a chunk of
                   exactly MIN_VMALLOC_CHUNK bytes; larger requests are
                   rounded up to a multiple of PAGE_SIZE.
     vaddrPP       on success, receives the address of the new chunk
     actualBytesP  on success, receives the size of the new chunk in bytes

   Returns 0 on success.  Returns -ENOMEM, leaving *vaddrPP and
   *actualBytesP untouched, if the request would exceed
   MaxTotalVMallocBytes, if no chunk descriptor could be allocated, or if
   the vmalloc itself failed. */
int cxiAllocSharedKernelMemory(int minBytes, char** vaddrPP, int* actualBytesP)
{
  struct ShMemChunkDesc* chunkP;
  char* vaddrP;
  int actualBytes;
#ifdef GPFS_ARCH_POWER
  /* Only the POWER allocation path below needs explicit protection bits */
  pgprot_t prot;
#endif

  /* Compute actual number of bytes to allocate from the vmalloc area */
  if (minBytes <= MIN_VMALLOC_CHUNK)
    actualBytes = MIN_VMALLOC_CHUNK;
  else
    actualBytes = ((minBytes+PAGE_SIZE-1)/PAGE_SIZE) * PAGE_SIZE;
  TRACE2(TRACE_SSEG, 5, TRCID_ALLOC_SHARED_VMALLOC,
         "cxiAllocSharedKernelMemory: vmalloc %d minBytes %d\n",
         actualBytes, minBytes);

  /* Return failure if this allocation would put us over the limit.  The
     test is written as a subtraction so that the sum of the current total
     and the new chunk size is never formed and therefore cannot overflow.
     A size that became non-positive when the rounded value was stored in
     an int is rejected here as well.
     NOTE(review): this test reads TotalVMallocBytes without holding
     ChunkListLock, so two concurrent callers could both pass it; confirm
     whether callers are serialized elsewhere. */
  if (actualBytes <= 0 ||
      TotalVMallocBytes > MaxTotalVMallocBytes ||
      actualBytes > MaxTotalVMallocBytes - TotalVMallocBytes)
    return -ENOMEM;

  /* Get a descriptor for the memory to be allocated */
  chunkP = (struct ShMemChunkDesc*) kmem_cache_alloc(ChunkCacheP, SLAB_KERNEL);
  if (chunkP == NULL)
    return -ENOMEM;

  /* Allocate memory */
  /* ?? Instead of calling vmalloc here, we could also do something like:
       pgprot_t prot;
       prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
       vaddrP = __vmalloc(actualBytes, GFP_KERNEL | __GFP_HIGHMEM, prot);
    This is an expansion of the vmalloc inline function, with _PAGE_USER
    added to the protection bits so that the PTE entries will already be set
    correctly.  However, a call to unprotectKernelMemory would still be
    needed to set the protection bits in the PMD entries.

    There is also the possibility here of using __GFP_HIGHMEM instead of
    GFP_KERNEL on machines with sufficient high memory.  The storage
    allocated here will never be used as I/O buffers, so high memory would
    be a good place to put it.  This would give I/O buffers a greater chance
    of being allocated below 1G, reducing the need for bounce buffers to do
    I/O. */
#ifdef GPFS_ARCH_POWER
  prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_USER);
  vaddrP = __vmalloc(actualBytes, GFP_KERNEL, prot);
#else
  vaddrP = vmalloc(actualBytes);
#endif
  if (vaddrP == NULL)
    goto errorExit;

  /* Account for the new chunk and publish it on the chunk list */
  spin_lock(&ChunkListLock);
  NVMallocChunks += 1;
  TotalVMallocBytes += actualBytes;

  /* Remember address of first chunk allocated */
  if (NVMallocChunks == 1)
    FirstVMallocChunkP = vaddrP;

  /* Fill in chunk descriptor and add it to the proper list */
  chunkP->vaddrP = vaddrP;
  chunkP->len = actualBytes;
  list_add(&chunkP->chunkList, &ChunkListHead);
  spin_unlock(&ChunkListLock);

  /* Make memory just allocated addressable by the current process */
  unprotectKernelMemory(vaddrP, actualBytes, true);

  /* Return results */
  *vaddrPP = vaddrP;
  *actualBytesP = actualBytes;
  TRACE3(TRACE_SSEG, 1, TRCID_ALLOC_SHARED_EXIT,
         "cxiAllocSharedKernelMemory: vaddr 0x%lX actualBytes %d minBytes %d\n",
         vaddrP, actualBytes, minBytes);
  return 0;

errorExit:
  /* vmalloc failed: give the unused descriptor back to its cache */
  TRACE2(TRACE_SSEG, 0, TRCID_ALLOC_SHARED_FAIL,
         "cxiAllocSharedKernelMemory: failed to allocate %d bytes "
         "minBytes %d\n", actualBytes, minBytes);
  kmem_cache_free(ChunkCacheP, (void*)chunkP);
  return -ENOMEM;
}


/* Make every chunk currently on ChunkListHead accessible to the calling
   process by calling unprotectKernelMemory on each one.

   basePP  receives FirstVMallocChunkP, the address of the first chunk
           allocated, which serves as the base of the GPFS shared segment

   Returns 0 if at least one chunk exists, otherwise ENOENT.
   NOTE(review): ENOENT is returned as a positive value here, whereas
   cxiAllocSharedKernelMemory returns a negative errno; confirm which sign
   callers of this routine expect. */
int cxiMapAllSharedKernelMemory(char** basePP)
{
  struct list_head* posP;
  struct ShMemChunkDesc* descP;

  /* Visit each allocated chunk while holding the list lock */
  spin_lock(&ChunkListLock);
  list_for_each(posP, &ChunkListHead)
  {
    descP = list_entry(posP, struct ShMemChunkDesc, chunkList);
    TRACE1(TRACE_SSEG, 11, TRCID_MAPALL_MULTI,
           "cxiMapAllSharedKernelMemory: chunkP 0x%lX\n", descP);
    unprotectKernelMemory(descP->vaddrP, descP->len, false);
  }
  spin_unlock(&ChunkListLock);

  /* Hand back the base of the shared segment */
  *basePP = FirstVMallocChunkP;

  /* Nothing allocated means there is nothing to map */
  if (NVMallocChunks > 0)
    return 0;
  return ENOENT;
}


/* Unmap and deallocate every chunk on ChunkListHead.  Each chunk is taken
   off the list, passed to reprotectKernelMemory, and then released with
   vfree; its descriptor goes back to ChunkCacheP.  When the list is empty
   FirstVMallocChunkP is reset to NULL.  Always returns 0. */
int cxiDeallocAllSharedKernelMemory()
{
  struct list_head* entryP;
  struct ShMemChunkDesc* descP;

  spin_lock(&ChunkListLock);
  for (;;)
  {
    /* The emptiness test is always made with the list lock held */
    if (list_empty(&ChunkListHead))
      break;

    /* Detach the head entry and update the totals while still locked */
    entryP = ChunkListHead.next;
    list_del(entryP);
    descP = list_entry(entryP, struct ShMemChunkDesc, chunkList);
    NVMallocChunks -= 1;
    TotalVMallocBytes -= descP->len;

    /* The chunk is no longer on the list, so the lock is dropped while
       its storage is being freed */
    spin_unlock(&ChunkListLock);
    reprotectKernelMemory(descP->vaddrP, descP->len);
    TRACE2(TRACE_SSEG, 4, TRCID_FREEALL_VFREE,
           "cxiDeallocAllSharedKernelMemory: vfree 0x%lX chunkP 0x%lX\n",
           descP->vaddrP, descP);
    vfree(descP->vaddrP);
    kmem_cache_free(ChunkCacheP, (void*)descP);

    /* Retake the lock before looking at the list again */
    spin_lock(&ChunkListLock);
  }

  /* No chunks remain, so there is no first chunk any more */
  FirstVMallocChunkP = NULL;
  spin_unlock(&ChunkListLock);
  return 0;
}

/* This call looks very similar to a MAP_ANONYMOUS mmap() call.  That's
 * because we used to do mmap() for this region.  Unfortunately when we
 * want MAP_PRIVATE semantics we don't get the results on Linux that we
 * expect.  The trouble starts when the pages of this memory
 * area are marked copy-on-write.  Since this is our buffer pool, when
 * I/O gets done, the old page goes to the child process and the new page goes
 * to the parent (mmfsd).  Unfortunately, the I/O gets done to the old page
 * since its physical address was cached in the kiobuf.
 *
 * One attempt at fixing this was by making the area shared between parent
 * and child via MAP_SHARED. However, it opens the possibility of a child
 * process run from system() or popen() being able to stomp on the GPFS buffer
 * pool.  Additionally putting MAP_SHARED on the region causes it
 * to be internally mapped to /dev/zero (apparently it needs some file mapping
 * on this MAP_ANONYMOUS region).  Subsequent madvise() calls saying that
 * we don't need the pages (MADV_DONTNEED) don't really free the
 * pages since there is still a hold count due to the kernel /dev/zero
 * mapping.  Thus the free pages reported by vmstat don't go down even
 * though we're freeing them from the mmap'd region.
 *
 * This all boils down to a workaround where we MAP_PRIVATE as we
 * wanted but set the VM_DONTCOPY flag so these mmap pages don't
 * get inherited by child processes.
 *
 * GPFS also needs to make sure that pages of its buffer pool are pinned in
 * memory.  This is necessary because GPFS caches the pointers to the struct
 * page objects returned by map_user_kiobuf.  Linux might steal pages in
 * one of two ways: reclaim_page will steal pages with count <= 1, and
 * swap_out_vma will clear the page table mapping of pages belonging to
 * vm_area_structs that do not have the VM_LOCKED or VM_RESERVED bits set.
 * GPFS prevents the first case because map_user_kiobuf increases page
 * reference counts to 2.  By turning on the VM_RESERVED bit here, we prevent
 * GPFS page pool buffers from being swapped out.  We cannot use mlock to
 * keep pages pinned, because that will create a separate vm_area_struct
 * for each buffer.  See comment in KPinIOBuffer for more details.
 *
 * Parameters:
 *   inAddr   requested address, handed to do_mmap (MAP_FIXED is not set)
 *   len      length of the mapping in bytes
 *   prot     protection bits handed to do_mmap
 *   outAddr  receives the address of the new mapping, or NULL on failure
 *
 * Returns 0 on success and -EFAULT if the mapping could not be created.
 */
int
kxMapPrivate(char *inAddr, unsigned long len, unsigned long prot,
             char **outAddr)
{
  struct mm_struct *mmP;
  struct vm_area_struct *vmaP = NULL;
  unsigned long mapRc;

  mmP = current->mm;

#if LINUX_KERNEL_VERSION >= 2040312 || GPFS_ARCH_POWER
  down_write(&mmP->mmap_sem);
#else
  down(&mmP->mmap_sem);
#endif

  mapRc = do_mmap(NULL, (unsigned long)inAddr, len, prot,
                  MAP_PRIVATE | MAP_ANONYMOUS, 0);

  /* do_mmap signals failure by returning a negated errno rather than 0.
   * Such values occupy the top 4095 values of the unsigned long range,
   * where no mapping can start.  Treat them as "no mapping" so that the
   * caller never receives an errno disguised as a pointer.
   */
  if (mapRc >= (unsigned long)(-4095L))
    mapRc = 0;
  *outAddr = (char *)mapRc;

  if (mapRc != 0)
  {
    /* Locate the vm_area_struct that do_mmap created for this mapping */
    for (vmaP = mmP->mmap; vmaP != NULL; vmaP = vmaP->vm_next)
      if (vmaP->vm_start == mapRc)
      {
        /* We don't want our vm_area_structs merged since we are
         * about to set a flag that would cross into an area where
         * it might not be good.  For instance if we get merged with
         * the stack vm area then we won't be able to fork since the
         * stack wouldn't be copied.
         */
        LOGASSERT(vmaP->vm_end == vmaP->vm_start + len);
        vmaP->vm_flags |= VM_DONTCOPY | VM_RESERVED;
        break;
      }

    DBGASSERT(vmaP != NULL);
  }
#if LINUX_KERNEL_VERSION >= 2040312 || GPFS_ARCH_POWER
  up_write(&mmP->mmap_sem);
#else
  up(&mmP->mmap_sem);
#endif

  TRACE5(TRACE_SSEG, 1, TRCID_CXI_MAP_PRIVATE,
         "kxMapPrivate: inAddr 0x%X len %d prot 0x%X outAddr 0x%X vmaP 0x%X\n",
         inAddr, len, prot, *outAddr, vmaP);

  if (*outAddr)
    return 0;

  return -EFAULT;
}
